##################################################
##pathway analysis for GBA AND PINK mutation
## Date: 06/02/2021
## Author: Ahmed Hemedan (ahmed.hemedan@uni.lu)
##################################################

### Install and load required packages

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("clusterProfiler")
BiocManager::install("pathview")
BiocManager::install("enrichplot")
BiocManager::install("org.Hs.eg.db") ##  customized_human_annotation
BiocManager::install("org.Mm.eg.db") ##  customized_mouse_annotation 

library(clusterProfiler)
library(enrichplot)
library(pathview)
library(ggplot2)
library(gprofiler2)
library(biomaRt)

### Organism annotation database ----
## `organism` holds the name of the OrgDb annotation package used for gene ID
## conversion. It is currently set to human ("org.Hs.eg.db"); for mouse data
## set it to "org.Mm.eg.db" instead.
organism <- "org.Hs.eg.db"

## The package name is stored in a variable, so load it by its string value.
library(organism, character.only = TRUE)

### Read the DESeq2 results table
## TODO: set the path to the DESeq2 results CSV (the empty string is a
## placeholder). The code below reads the columns `log2FoldChange` and
## `Gene.stable.ID` from this table.
sys_data <- read.csv("", header = TRUE)

### Ranking metric: the log2 fold change, named by gene ID
original_gene_list <- sys_data$log2FoldChange
names(original_gene_list) <- sys_data$Gene.stable.ID

### Omit any NA values, then sort in decreasing order
### (required for clusterProfiler)
gene_list <- na.omit(original_gene_list)
gene_list <- sort(gene_list, decreasing = TRUE)

### Convert gene IDs for gseKEGG function
##  We will lose some genes here because not all IDs will be converted
ids <- bitr(
  names(original_gene_list),
  fromType = "ENSEMBL",
  toType = "ENTREZID",
  OrgDb = organism
)

### Remove duplicate IDs, keeping the first ENTREZ mapping per ENSEMBL ID
### (here "ENSEMBL" is used, but it should be whatever was selected as fromType)
dedup_ids <- ids[!duplicated(ids[["ENSEMBL"]]), ]

### sys_data2 holds only the genes that were successfully mapped by bitr above
sys_data2 <- sys_data[sys_data$Gene.stable.ID %in% dedup_ids$ENSEMBL, ]

### Add the corresponding ENTREZ IDs as column Y.
## Look each row up by its ENSEMBL ID rather than assigning the mapping table
## positionally: the rows of sys_data2 and dedup_ids are not guaranteed to have
## the same order or the same length (e.g. repeated gene IDs in the input).
sys_data2$Y <- dedup_ids$ENTREZID[
  match(sys_data2$Gene.stable.ID, dedup_ids$ENSEMBL)
]

### KEGG enrichment
## Data preparation
## Create a vector of the gene universe: log2 fold changes of the mapped genes
kegg_gene_list <- sys_data2$log2FoldChange

## Name vector with ENTREZ ids.
## These come from column Y of sys_data2, which was filled from the bitr
## ENSEMBL -> ENTREZ mapping; no other conversion table exists in this script.
names(kegg_gene_list) <- sys_data2$Y

## Omit any NA values, and any entry left without an ENTREZ id
kegg_gene_list <- na.omit(kegg_gene_list)
kegg_gene_list <- kegg_gene_list[!is.na(names(kegg_gene_list))]

## Sort the list in decreasing order (required for clusterProfiler)
kegg_gene_list <- sort(kegg_gene_list, decreasing = TRUE)

## KEGG organism code; keep it consistent with the `organism` annotation
## package chosen above ("hsa" is the default organism of gseKEGG).
kegg_organism <- "hsa"

## NOTE(review): pAdjustMethod = "none" means the 0.05 cutoff is applied to
## unadjusted p-values -- confirm that this is intended before interpreting
## the results.
kegg_enrichment <- gseKEGG(
  geneList      = kegg_gene_list,
  organism      = kegg_organism,
  nPerm         = 10000,
  minGSSize     = 3,
  maxGSSize     = 800,
  pvalueCutoff  = 0.05,
  pAdjustMethod = "none",
  keyType       = "ncbi-geneid"
)

## Export the result table; the result object is coerced to a data frame
## explicitly before writing.
write.csv(
  as.data.frame(kegg_enrichment),
  file = "kegg_enrichment.csv",
  row.names = FALSE
)

## Write a single plot to a PDF file.
## `plot` is evaluated lazily, i.e. only after the device has been opened, and
## the device is closed again even if building or drawing the plot fails.
## The plot is printed explicitly so that the file is not left empty when the
## script is run through source().
save_plot_pdf <- function(file, plot) {
  pdf(file)
  on.exit(dev.off(), add = TRUE)
  print(plot)
  invisible(file)
}

## Dotplot for the activated and suppressed pathways
save_plot_pdf(
  "dotplot.pdf",
  dotplot(
    kegg_enrichment,
    showCategory = 10,
    title = "Enriched Pathways",
    split = ".sign"
  ) + facet_grid(. ~ .sign)
)

## Enrichment map organizes enriched terms into a network with edges
## connecting overlapping gene sets.
## Recent enrichplot versions need the term similarity matrix to be computed
## with pairwise_termsim() first; older versions do not export that function,
## so it is only called when it is available.
emap_input <- kegg_enrichment
if ("pairwise_termsim" %in% getNamespaceExports("enrichplot")) {
  emap_input <- enrichplot::pairwise_termsim(kegg_enrichment)
}
save_plot_pdf("emapplot.pdf", emapplot(emap_input))

## categorySize can be either 'pvalue' or 'geneNum'.
## The fold changes must be named with the same gene IDs as the enrichment
## result, so the ENTREZ-named kegg_gene_list is used here (gene_list is named
## with ENSEMBL IDs and would not match any gene node).
save_plot_pdf(
  "cnetplot.pdf",
  cnetplot(kegg_enrichment, categorySize = "pvalue", foldChange = kegg_gene_list)
)

## Ridgeplot to interpret up/down-regulated pathways
save_plot_pdf(
  "ridgeplot.pdf",
  ridgeplot(kegg_enrichment) + labs(x = "enrichment distribution")
)

## KEGG pathway to draw. The blank string is a placeholder: change the id
## according to the enrichment results before running these calls.
pathway_id <- " "

## Produce the native KEGG plot (PNG)
hse <- pathview(
  gene.data  = kegg_gene_list,
  pathway.id = pathway_id,
  species    = kegg_organism
)

## Produce a different plot (PDF) (not displayed here).
## Note that this overwrites the `hse` value returned by the call above.
hse <- pathview(
  gene.data   = kegg_gene_list,
  pathway.id  = pathway_id,
  species     = kegg_organism,
  kegg.native = FALSE
)


